The data comes from a project known as Tidy Tuesday. We are looking at a data set from a few weeks ago, you can read more about it here.
office_ratings <-
readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv')
## Parsed with column specification:
## cols(
## season = col_double(),
## episode = col_double(),
## title = col_character(),
## imdb_rating = col_double(),
## total_votes = col_double(),
## air_date = col_date(format = "")
## )
# office_ratings <-
# read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv', stringsAsFactors = F)
Note that readr::read_csv, uses the read_csv from the readr library, but without loading in the library.
office_ratings_baseR <-
read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv', stringsAsFactors = F)
office_ratings_baseR$air_date <- as.Date(office_ratings_baseR$air_date[1:4])
office_ratings
## # A tibble: 188 x 6
## season episode title imdb_rating total_votes air_date
## <dbl> <dbl> <chr> <dbl> <dbl> <date>
## 1 1 1 Pilot 7.6 3706 2005-03-24
## 2 1 2 Diversity Day 8.3 3566 2005-03-29
## 3 1 3 Health Care 7.9 2983 2005-04-05
## 4 1 4 The Alliance 8.1 2886 2005-04-12
## 5 1 5 Basketball 8.4 3179 2005-04-19
## 6 1 6 Hot Girl 7.8 2852 2005-04-26
## 7 2 1 The Dundies 8.7 3213 2005-09-20
## 8 2 2 Sexual Harassment 8.2 2736 2005-09-27
## 9 2 3 Office Olympics 8.4 2742 2005-10-04
## 10 2 4 The Fire 8.4 2713 2005-10-11
## # … with 178 more rows
Will begin by looking at ratings, and best way to explore is by plotting the data.
library(ggplot2)
office_ratings %>%
ggplot(aes(imdb_rating)) +
geom_histogram() # or stat_bin
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Split by season
office_ratings %>%
ggplot(aes(imdb_rating)) +
geom_histogram() +
facet_wrap(~season)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Tidy up/make nice
office_ratings %>%
ggplot(aes(imdb_rating)) +
geom_histogram(bins = 10, colour = "cornflowerblue") +
facet_wrap(~season) +
labs(title = "IMDB Rating by Season",
x = "Rating", y = "Count")
# Try different bin size
office_ratings %>%
ggplot(aes(imdb_rating)) +
geom_histogram(bins = 30, fill = "cornflowerblue") +
facet_wrap(~season) +
labs(title = "IMDB Rating by Season",
x = "Rating", y = "Count")
# Improve Labels
my_prefix <- function(string, prefix = "Season ") paste0(prefix, string)
office_ratings %>%
ggplot(aes(imdb_rating)) +
geom_histogram(bins = 30, fill = "cornflowerblue") +
facet_wrap(~season, labeller = as_labeller(my_prefix)) +
labs(title = "IMDB Rating by Season",
x = "Rating", y = "Count") +
theme_light()
Next will look at boxplots
# Simple boxplot
office_ratings %>%
ggplot(aes(x = season, y = imdb_rating, group = season)) +
geom_boxplot()
# Fix labels on x axis
office_ratings %>%
ggplot(aes(x = season, y = imdb_rating, group = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light() +
scale_x_continuous(breaks = 1:max(office_ratings$season))
# Try to add colour
office_ratings %>%
ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light() +
scale_x_continuous(breaks = 1:max(office_ratings$season))
# Need to change season to factor
# office_ratings$season <- as.factor(office_ratings$season)
# office_ratings$season <- office_ratings %>% mutate(season = as.factor(season))
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light()
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light() + theme(legend.position = "none")
Seasons 3 and 4 look good, peaks again at 7. Last season bad apart from 3 outliers.
After Season 7, Micheal (lead character - Steve Carell) leaves the show.
# ?geom_vline
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light() + theme(legend.position = "none") +
geom_vline(xintercept=7.5, linetype = "dashed")
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
geom_boxplot() +
labs(title = "Boxplots of IMDB Ratings per Season",
y = "IMDB Rating", x = "Season") +
theme_light() + theme(legend.position = "none") +
geom_vline(xintercept = 7.5, linetype = "dashed") +
annotate("text", x = 8, y = 5, label = "Micheal Scott Leaves")
office_ratings %>%
group_by(season) %>%
summarise(mean_rating = mean(imdb_rating),
highest_rating = max(imdb_rating),
lowest_rating = min(imdb_rating),
variance = var(imdb_rating))
## # A tibble: 9 x 5
## season mean_rating highest_rating lowest_rating variance
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 8.02 8.4 7.6 0.0937
## 2 2 8.44 9.3 7.9 0.101
## 3 3 8.57 9.3 8 0.117
## 4 4 8.6 9.3 7.9 0.155
## 5 5 8.49 9.6 8.1 0.133
## 6 6 8.22 9.3 6.8 0.252
## 7 7 8.32 9.7 7.5 0.350
## 8 8 7.67 8.2 6.7 0.164
## 9 9 7.96 9.7 7.1 0.385
library(xtable)
office_ratings %>%
group_by(season) %>%
summarise(mean_rating = mean(imdb_rating),
highest_rating = max(imdb_rating),
lowest_rating = min(imdb_rating),
variance = var(imdb_rating)) %>%
xtable() %>%
print(type = "html")
| season | mean_rating | highest_rating | lowest_rating | variance | |
|---|---|---|---|---|---|
| 1 | 1.00 | 8.02 | 8.40 | 7.60 | 0.09 |
| 2 | 2.00 | 8.44 | 9.30 | 7.90 | 0.10 |
| 3 | 3.00 | 8.57 | 9.30 | 8.00 | 0.12 |
| 4 | 4.00 | 8.60 | 9.30 | 7.90 | 0.16 |
| 5 | 5.00 | 8.49 | 9.60 | 8.10 | 0.13 |
| 6 | 6.00 | 8.22 | 9.30 | 6.80 | 0.25 |
| 7 | 7.00 | 8.32 | 9.70 | 7.50 | 0.35 |
| 8 | 8.00 | 7.67 | 8.20 | 6.70 | 0.16 |
| 9 | 9.00 | 7.96 | 9.70 | 7.10 | 0.39 |
office_ratings %>%
group_by(season) %>%
summarise(mean_rating = mean(imdb_rating),
highest_rating = max(imdb_rating),
lowest_rating = min(imdb_rating),
variance = var(imdb_rating)) %>%
xtable() %>%
print(type = "html", include.rownames = F)
| season | mean_rating | highest_rating | lowest_rating | variance |
|---|---|---|---|---|
| 1.00 | 8.02 | 8.40 | 7.60 | 0.09 |
| 2.00 | 8.44 | 9.30 | 7.90 | 0.10 |
| 3.00 | 8.57 | 9.30 | 8.00 | 0.12 |
| 4.00 | 8.60 | 9.30 | 7.90 | 0.16 |
| 5.00 | 8.49 | 9.60 | 8.10 | 0.13 |
| 6.00 | 8.22 | 9.30 | 6.80 | 0.25 |
| 7.00 | 8.32 | 9.70 | 7.50 | 0.35 |
| 8.00 | 7.67 | 8.20 | 6.70 | 0.16 |
| 9.00 | 7.96 | 9.70 | 7.10 | 0.39 |
office_ratings %>%
mutate(season = as.factor(season)) %>%
group_by(season) %>%
summarise(mean_rating = mean(imdb_rating),
highest_rating = max(imdb_rating),
lowest_rating = min(imdb_rating),
variance = var(imdb_rating)) %>%
rename(Season = season,
Mean = mean_rating,
Higest = highest_rating,
Lowest = lowest_rating,
Variance = variance) %>%
xtable(caption = "Summary Statistics of Rating by Season") %>%
print(type = "html", include.rownames = F)
| Season | Mean | Higest | Lowest | Variance |
|---|---|---|---|---|
| 1 | 8.02 | 8.40 | 7.60 | 0.09 |
| 2 | 8.44 | 9.30 | 7.90 | 0.10 |
| 3 | 8.57 | 9.30 | 8.00 | 0.12 |
| 4 | 8.60 | 9.30 | 7.90 | 0.16 |
| 5 | 8.49 | 9.60 | 8.10 | 0.13 |
| 6 | 8.22 | 9.30 | 6.80 | 0.25 |
| 7 | 8.32 | 9.70 | 7.50 | 0.35 |
| 8 | 7.67 | 8.20 | 6.70 | 0.16 |
| 9 | 7.96 | 9.70 | 7.10 | 0.39 |
office_ratings
## # A tibble: 188 x 6
## season episode title imdb_rating total_votes air_date
## <dbl> <dbl> <chr> <dbl> <dbl> <date>
## 1 1 1 Pilot 7.6 3706 2005-03-24
## 2 1 2 Diversity Day 8.3 3566 2005-03-29
## 3 1 3 Health Care 7.9 2983 2005-04-05
## 4 1 4 The Alliance 8.1 2886 2005-04-12
## 5 1 5 Basketball 8.4 3179 2005-04-19
## 6 1 6 Hot Girl 7.8 2852 2005-04-26
## 7 2 1 The Dundies 8.7 3213 2005-09-20
## 8 2 2 Sexual Harassment 8.2 2736 2005-09-27
## 9 2 3 Office Olympics 8.4 2742 2005-10-04
## 10 2 4 The Fire 8.4 2713 2005-10-11
## # … with 178 more rows
office_ratings %>%
ggplot(aes(x = air_date, y = total_votes)) +
geom_line()
office_ratings %>%
ggplot(aes(x = air_date, y = total_votes)) +
geom_line(colour = "purple") +
labs(title = "Number of IMDB Votes Over Time",
x = "Date", y = "Total Number of Votes")
office_ratings %>%
mutate(season = as.factor(season)) %>%
group_by(season) %>%
summarise(mean_rating = mean(total_votes),
highest_rating = max(total_votes),
lowest_rating = min(total_votes),
variance = var(total_votes)) %>%
rename(Season = season,
Mean = mean_rating,
Higest = highest_rating,
Lowest = lowest_rating,
Variance = variance) %>%
xtable(caption = "Summary Statistics of Total Votes by Season") %>%
print(type = "html", include.rownames = F)
| Season | Mean | Higest | Lowest | Variance |
|---|---|---|---|---|
| 1 | 3195.33 | 3706.00 | 2852.00 | 131418.27 |
| 2 | 2630.64 | 3644.00 | 2323.00 | 118547.67 |
| 3 | 2443.17 | 3087.00 | 2254.00 | 54387.97 |
| 4 | 2422.57 | 4095.00 | 1977.00 | 314309.19 |
| 5 | 2150.73 | 5948.00 | 1808.00 | 631845.08 |
| 6 | 1856.54 | 3579.00 | 1571.00 | 153722.18 |
| 7 | 2030.96 | 5749.00 | 1581.00 | 819589.52 |
| 8 | 1546.38 | 1829.00 | 1393.00 | 10059.38 |
| 9 | 1852.61 | 7934.00 | 1394.00 | 1854068.43 |
library(knitr)
office_ratings %>%
mutate(season = as.factor(season)) %>%
group_by(season) %>%
summarise(mean_rating = mean(total_votes),
highest_rating = max(total_votes),
lowest_rating = min(total_votes),
variance = var(total_votes)) %>%
rename(Season = season,
Mean = mean_rating,
Higest = highest_rating,
Lowest = lowest_rating,
Variance = variance) -> votes_summary_by_season
kable(votes_summary_by_season)
| Season | Mean | Higest | Lowest | Variance |
|---|---|---|---|---|
| 1 | 3195.333 | 3706 | 2852 | 131418.27 |
| 2 | 2630.636 | 3644 | 2323 | 118547.67 |
| 3 | 2443.174 | 3087 | 2254 | 54387.97 |
| 4 | 2422.571 | 4095 | 1977 | 314309.19 |
| 5 | 2150.731 | 5948 | 1808 | 631845.08 |
| 6 | 1856.538 | 3579 | 1571 | 153722.18 |
| 7 | 2030.958 | 5749 | 1581 | 819589.52 |
| 8 | 1546.375 | 1829 | 1393 | 10059.38 |
| 9 | 1852.609 | 7934 | 1394 | 1854068.43 |
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(votes_summary_by_season, caption = "Summary Statistics of Total Votes by Season") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Season | Mean | Higest | Lowest | Variance |
|---|---|---|---|---|
| 1 | 3195.333 | 3706 | 2852 | 131418.27 |
| 2 | 2630.636 | 3644 | 2323 | 118547.67 |
| 3 | 2443.174 | 3087 | 2254 | 54387.97 |
| 4 | 2422.571 | 4095 | 1977 | 314309.19 |
| 5 | 2150.731 | 5948 | 1808 | 631845.08 |
| 6 | 1856.538 | 3579 | 1571 | 153722.18 |
| 7 | 2030.958 | 5749 | 1581 | 819589.52 |
| 8 | 1546.375 | 1829 | 1393 | 10059.38 |
| 9 | 1852.609 | 7934 | 1394 | 1854068.43 |
# https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html
office_ratings %>%
ggplot(aes(x = imdb_rating, y = total_votes)) +
geom_point()
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
geom_point()
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
geom_point() +
labs(title = "Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
geom_point() +
labs(title = "Votes vs Rating",
x = "Rating",
y = "Total Number of Votes",
colour = "Season") -> votes_vs_rating
ggplotly(votes_vs_rating)
votes_vs_rating2 <-
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = imdb_rating, y = total_votes, colour = season,
text = paste0(title,
"<br>IMDB: ", imdb_rating,
"<br>No. Votes: ", total_votes,
"<br>Season: ", season))) +
geom_point() +
labs(title = "IMDB Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")
ggplotly(votes_vs_rating2, tooltip = "text")
votes_vs_rating3 <-
office_ratings %>%
mutate(season = as.factor(season)) %>%
ggplot(aes(x = imdb_rating, y = total_votes, colour = season,
text = paste0("S", season, ".E", episode, " ", title,
"<br>IMDB: ", imdb_rating,
"<br>No. Votes: ", total_votes))) +
geom_point() +
labs(title = "IMDB Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")
ggplotly(votes_vs_rating3, tooltip = "text")